import sys
# Python 2 notebook export: bare `print` statements and IPython %magics below.
print sys.version
from joblib import Parallel, delayed
import multiprocessing
# Reserve two cores for the rest of the system.
nCores = multiprocessing.cpu_count() - 2 # Allow other apps to run
print 'nCores: %d' % (nCores)
from datetime import datetime, time
print 'now: %s' % str(datetime.now())
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display, Image
# rpy2 bridge: R is used later for the model-stats plots.
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.robjects.lib import grid
from rpy2.robjects.lib import ggplot2
import rpy2.robjects.pandas2ri
import numpy as np
import os
import pandas as pd
from scipy import ndimage
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
from sklearn.linear_model import LogisticRegression
import tarfile
# Shared image-pipeline helpers (myimportDbs, mysearchParams, myexportDf,
# mydisplayImagePredictions, glbwriteSubmission, ...).
%run img_utils.py
The dataset specs are expected to be defined in img_glbSpec_SFDD (run as img_glbSpec_SFDD_ImgSz_64.py below).
# Load dataset-specific globals: glbImg, glbRspClass / glbRspClassN /
# glbRspClassDesc, glbPickleFile, glbDataFile, ...
%run img_glbSpec_SFDD_ImgSz_64.py
#print 'glbDataFile: %s' % (glbDataFile)
print 'glbImg: %s' % (glbImg)
print 'glbRspClass: %s' % (glbRspClass)
print 'glbRspClassN: %d' % (glbRspClassN)
print 'glbPickleFile: %s' % (glbPickleFile)
# glbDataURL = 'http://yaroslavvb.com/upload/notMNIST/'
# glbImg['size'] = 14
# %run img_glbSpec_SFDD_ImgSz_64.py
# %run img_utils.py
# Unpack the pickled splits: Idn = observation ids, Ftr = image feature
# arrays, Rsp = response labels, for the Fit / Vld (validation) / New (test)
# sets; the two trailing slots returned by myimportDbs are unused here.
glbObsFitIdn, glbObsFitFtr, glbObsFitRsp, \
glbObsVldIdn, glbObsVldFtr, glbObsVldRsp, \
glbObsNewIdn, glbObsNewFtr, glbObsNewRsp, \
_, _ = myimportDbs(glbPickleFile['data'])
print('Fit set:', len(glbObsFitIdn), glbObsFitFtr.shape, glbObsFitRsp.shape)
print('Vld set:', len(glbObsVldIdn), glbObsVldFtr.shape, glbObsVldRsp.shape)
# print('Trn set:', len(glbObsTrnIdn), glbObsTrnFtr.shape, glbObsTrnRsp.shape)
print('New set:', len(glbObsNewIdn), glbObsNewFtr.shape, glbObsNewRsp.shape)
# print globals().keys().index('glbObsFitIdn')
# with open(glbPickleFile['data'], 'rb') as f:
# save = pickle.load(f)
# glbObsFitIdn = save['glbObsFitIdn']
# glbObsFitFtr = save['glbObsFitFtr']
# glbObsFitRsp = save['glbObsFitRsp']
# glbObsVldIdn = save['glbObsVldIdn']
# glbObsVldFtr = save['glbObsVldFtr']
# glbObsVldRsp = save['glbObsVldRsp']
# glbObsTrnIdn = save['glbObsTrnIdn']
# glbObsTrnFtr = save['glbObsTrnFtr']
# glbObsTrnRsp = save['glbObsTrnRsp']
# glbObsNewIdn = save['glbObsNewIdn']
# glbObsNewFtr = save['glbObsNewFtr']
# glbObsNewRsp = save['glbObsNewRsp']
# del save # hint to help gc free up memory
# print('Fit set:', len(glbObsFitIdn), glbObsFitFtr.shape, glbObsFitRsp.shape)
# print('Vld set:', len(glbObsVldIdn), glbObsVldFtr.shape, glbObsVldRsp.shape)
# print('Trn set:', len(glbObsTrnIdn), glbObsTrnFtr.shape, glbObsTrnRsp.shape)
# print('New set:', len(glbObsNewIdn), glbObsNewFtr.shape, glbObsNewRsp.shape)
Let's get an idea of what an off-the-shelf classifier can give you on this data. It's always good to check that there is something to learn, and that it's a problem that is not so trivial that a canned solution solves it.
Train a simple model on this data using 50, 100, 1000 and 5000 training samples. Hint: you can use the LogisticRegression model from sklearn.linear_model.
Optional question: train an off-the-shelf model on all the data!
# print glbObsTrnFtr[0:3,:,:]
# print np.reshape(glbObsTrnFtr[0:3,:,:], (3, glbObsTrnFtr.shape[1] * glbObsTrnFtr.shape[2]))
# print np.reshape(glbObsTrnFtr[0:3,:,:], (3, glbObsTrnFtr.shape[1] * glbObsTrnFtr.shape[2])).shape
# from sklearn import metrics, linear_model
# import pandas as pd
# from datetime import datetime, time
# True only when the 'New' (test) observations carry real labels (> -1).
print (glbObsNewRsp > -1).any()
# Flatten each (nObs, height, width) image stack into (nObs, height*width)
# so the arrays can be fed to sklearn estimators.
glbObsFitFtrRsh = np.reshape(glbObsFitFtr,
    (glbObsFitFtr.shape[0], glbObsFitFtr.shape[1] * glbObsFitFtr.shape[2]))
glbObsVldFtrRsh = np.reshape(glbObsVldFtr,
    (glbObsVldFtr.shape[0], glbObsVldFtr.shape[1] * glbObsVldFtr.shape[2]))
glbObsNewFtrRsh = np.reshape(glbObsNewFtr,
    (glbObsNewFtr.shape[0], glbObsNewFtr.shape[1] * glbObsNewFtr.shape[2]))
print('Fit Ftr Reshaped:', glbObsFitFtrRsh.shape)
print('Vld Ftr Reshaped:', glbObsVldFtrRsh.shape)
print('New Ftr Reshaped:', glbObsNewFtrRsh.shape)
print type(glbObsFitRsp)
print glbObsFitRsp.shape
print glbObsVldRsp.shape
# Trn = Fit + Vld combined, used later to refit the final model on all
# labeled data.
glbObsTrnIdn = glbObsFitIdn + glbObsVldIdn
glbObsTrnFtr = np.vstack((glbObsFitFtr, glbObsVldFtr))
glbObsTrnRsp = np.concatenate((glbObsFitRsp, glbObsVldRsp))
print('Trn set:', len(glbObsTrnIdn), glbObsTrnFtr.shape, glbObsTrnRsp.shape)
glbObsTrnFtrRsh = np.reshape(glbObsTrnFtr,
    (glbObsTrnFtr.shape[0], glbObsTrnFtr.shape[1] * glbObsTrnFtr.shape[2]))
print('Trn Ftr Reshaped:', glbObsTrnFtrRsh.shape)
print type(glbObsVldRsp)
print glbObsVldRsp.shape
print type(glbObsVldRsp[0:1])
# def fitMdl(nObsFit = 50):
# mdl = linear_model.LogisticRegression(verbose = 1)
# mdl.fit(np.reshape(glbObsTrnFtr[0:nObsFit,:,:], \
# (nObsFit, glbObsTrnFtr.shape[1] * glbObsTrnFtr.shape[2])), \
# glbObsTrnRsp[0:nObsFit])
# print mdl.get_params()
# print mdl.coef_.shape
# print ' coeff stats:'
# for lblIx in xrange(len(dspLabels)):
# print ' label:%s; minCoeff:row:%2d, col:%2d, value:%0.4f; maxCoeff:row:%2d, col:%2d, value:%0.4f;' % \
# (dspLabels[lblIx], \
# mdl.coef_[lblIx,:].argmin() / glbImgSz, \
# mdl.coef_[lblIx,:].argmin() % glbImgSz, \
# mdl.coef_[lblIx,:].min(), \
# mdl.coef_[lblIx,:].argmax() / glbImgSz, \
# mdl.coef_[lblIx,:].argmax() % glbImgSz, \
# mdl.coef_[lblIx,:].max())
# lclObsFitRspPred = mdl.predict(np.reshape(glbObsTrnFtr[0:nObsFit,:,:], \
# (nObsFit , glbImgSz ** 2)))
# accFit = metrics.accuracy_score(lclObsFitRspPred, glbObsTrnRsp[0:nObsFit])
# print ' accuracy train:%0.4f' % (accFit)
# print metrics.confusion_matrix(glbObsTrnRsp[0:nObsFit], lclObsFitRspPred)
# valid_pred_labels = mdl.predict(np.reshape(glbObsVldFtr, \
# (glbObsVldFtr.shape[0], glbImgSz ** 2)))
# accuracy_valid = metrics.accuracy_score(valid_pred_labels, glbObsVldRsp)
# print ' accuracy valid:%0.4f' % (accuracy_valid)
# print metrics.confusion_matrix(glbObsVldRsp , valid_pred_labels)
# test_pred_labels = mdl.predict(np.reshape(glbObsNewFtr, \
# (glbObsNewFtr.shape[0], glbImgSz ** 2)))
# accuracy_test = metrics.accuracy_score( test_pred_labels, glbObsNewRsp)
# print ' accuracy test:%0.4f' % (accuracy_test)
# test_conf = pd.DataFrame(metrics.confusion_matrix( glbObsNewRsp, test_pred_labels), \
# index = dspLabels, columns = dspLabels)
# print test_conf
# return(mdl, (accFit, accuracy_valid, accuracy_test))
# try:
# glbMdlDf = pd.read_pickle(glbPickleFile['models'])
# except IOError, e:
# print(e)
# if e.errno == 2:
# glbMdlDf = pd.DataFrame()
# else:
# raise
# print(glbMdlDf)
# %run img_glbSpec_SFDD_ImgSz_64.py
%run img_utils.py
# import img_utils
def fitMdlLgtRgrSkl(lclObsFitFtrRsh, lclObsFitRsp, nObsFit = 50,
                    verbose = False):
    """Fit sklearn LogisticRegression on the first nObsFit fit observations.

    Scores accuracy and log loss on the fit, validation, and (when labeled)
    new sets; when verbose, also prints confusion matrices, plots the
    validation confusion matrix, and displays sample image predictions.

    Returns (one-row results DataFrame, validation proba, new proba).

    NOTE(review): reads module-level globals (glbObsVldFtrRsh, glbObsVldRsp,
    glbObsNewFtrRsh, glbObsNewRsp, glbRspClass, glbRspClassN,
    glbRspClassDesc, glbObsVldIdn, glbObsVldFtr) and the helper
    mydisplayImagePredictions from img_utils.py — these must be defined
    before calling.
    """
    from sklearn import metrics as skl_metrics
    from sklearn.linear_model \
        import LogisticRegression as skl_LogisticRegression
    print('\nLogistic Regression (sklearn): nObsFit: %5d; ' % ( \
        nObsFit))
    startTm = datetime.now()
    # One-row results frame; model object and metrics are attached below.
    mdlDf = pd.DataFrame({'id': 'LgtRgr.skl',
                          'nObsFit': [nObsFit]
                          })
    mdl = skl_LogisticRegression(verbose = verbose)
    mdl.fit(lclObsFitFtrRsh[:nObsFit], lclObsFitRsp[:nObsFit])
    mdlDf['model'] = mdl
    # In-sample (fit) metrics.
    lclObsFitRspPred = mdl.predict(lclObsFitFtrRsh[:nObsFit])
    lclObsFitRspPredProba = mdl.predict_proba(lclObsFitFtrRsh[:nObsFit])
    accFit = skl_metrics.accuracy_score(lclObsFitRspPred,
                                        lclObsFitRsp[:nObsFit])
    logLossFit = skl_metrics.log_loss(lclObsFitRsp[:nObsFit],
                                      lclObsFitRspPredProba)
    if verbose:
        print '\n Fit accuracy:%0.4f' % (accFit)
        print ' Fit logLoss:%0.4f' % (logLossFit)
#         print metrics.confusion_matrix(glbObsFitRsp[0:nObsFit],
#                                        lclObsFitRspPred)
    # Out-of-sample (validation) metrics; always recorded in mdlDf.
    lclObsVldRspPred = mdl.predict(glbObsVldFtrRsh)
    lclObsVldRspPredProba = mdl.predict_proba(glbObsVldFtrRsh)
    mdlDf['accVld'] = accVld = skl_metrics.accuracy_score(lclObsVldRspPred,
                                                          glbObsVldRsp)
    mdlDf['logLossVld'] = logLossVld = skl_metrics.log_loss(glbObsVldRsp,
                                                            lclObsVldRspPredProba)
    if verbose:
        print '\n Vld accuracy:%0.4f' % (accVld)
        print ' Vld logLoss:%0.4f' % (logLossVld)
        print skl_metrics.confusion_matrix(glbObsVldRsp, lclObsVldRspPred)
        # Axis labels: '<desc>:<class>:actl' rows vs 'pred:<class>:<desc>'
        # columns for the confusion-matrix heatmap.
        yLbl = [glbRspClassDesc[glbRspClass[ix]] + ':' +
                glbRspClass[ix] + ':actl' for ix in xrange(glbRspClassN)]
        xLbl = ['pred:' + glbRspClass[ix] + ':' + glbRspClassDesc[glbRspClass[ix]]
                for ix in xrange(glbRspClassN)]
#         print labels
        plt.matshow(skl_metrics.confusion_matrix(glbObsVldRsp,
                                                 lclObsVldRspPred), cmap='Reds',
                    interpolation='none')
        plt.yticks(np.arange(10), yLbl)
        plt.xticks(np.arange(10), xLbl, rotation=90);
#         print 'glbRspClass: %s' % (glbRspClass)
        mydisplayImagePredictions(mdl,
            glbObsVldIdn, glbObsVldFtr, glbObsVldRsp, lclObsVldRspPredProba,
            glbRspClass, glbRspClassDesc)
#         nSmpImgs = 5
#         plot_occlusion(mdl, np.reshape(glbObsVldFtr[:nSmpImgs],
#             (nSmpImgs, 1, glbObsVldFtr.shape[1], glbObsVldFtr.shape[2])),
#             glbObsVldRsp[:nSmpImgs])
#         plt.show()
    # New (test) set: metrics only when labels are present (> -1).
    lclObsNewRspPred = mdl.predict(glbObsNewFtrRsh)
    lclObsNewRspPredProba = mdl.predict_proba(glbObsNewFtrRsh)
    if (glbObsNewRsp > -1).any():
        mdlDf['accNew'] = accNew = skl_metrics.accuracy_score(lclObsNewRspPred,
                                                              glbObsNewRsp)
        mdlDf['logLossNew'] = logLossNew = skl_metrics.log_loss(glbObsNewRsp,
                                                                lclObsNewRspPredProba)
        if verbose:
            print '\n New accuracy:%0.4f' % (accNew)
            print ' New logLoss:%0.4f' % (logLossNew)
            print skl_metrics.confusion_matrix(glbObsNewRsp, lclObsNewRspPred)
            print '\n lclObsNewRspPredProba:'
            print np.array_str(lclObsNewRspPredProba[:5, :],
                               precision=4, suppress_small=True)
    # Record the predicted-class counts for the new set (cell holds a dict,
    # hence the set_value indirection).
    clsKntDct = {'clsKnt' : np.unique(lclObsNewRspPred, return_counts = True)}
    mdlDf['predNew'] = None
    mdlDf.set_value(0, 'predNew', clsKntDct)
    if verbose:
        print '\n New prediction knts:'
        print mdlDf['predNew'][0]
#     print mdl.get_params()
#     print mdl.coef_.shape
#     print ' coeff stats:'
#     for lblIx in xrange(len(dspLabels)):
#         print ' label:%s; minCoeff:row:%2d, col:%2d, value:%0.4f; maxCoeff:row:%2d, col:%2d, value:%0.4f;' % \
#             (dspLabels[lblIx], \
#              mdl.coef_[lblIx,:].argmin() / glbImgSz, \
#              mdl.coef_[lblIx,:].argmin() % glbImgSz, \
#              mdl.coef_[lblIx,:].min(), \
#              mdl.coef_[lblIx,:].argmax() / glbImgSz, \
#              mdl.coef_[lblIx,:].argmax() % glbImgSz, \
#              mdl.coef_[lblIx,:].max())
    mdlDf['model'] = mdl
    # Wall-clock fit+score duration, in whole seconds.
    mdlDf['duration'] = (datetime.now() - startTm).seconds
    print(' duration: %.2d seconds' % (mdlDf['duration'][0]))
    return(mdlDf, lclObsVldRspPredProba, lclObsNewRspPredProba)
# Smoke-test the fit function on 100 observations, twice.
mdlDf = pd.DataFrame()
thsDf, thsObsVldRspPredProba, thsObsNewRspPredProba = fitMdlLgtRgrSkl(
    glbObsFitFtrRsh, glbObsFitRsp, nObsFit = 100, verbose = True)
mdlDf = mdlDf.append(thsDf)
# To check if model results are deterministic
thsDf, thsObsVldRspPredProba, thsObsNewRspPredProba = fitMdlLgtRgrSkl(
    glbObsFitFtrRsh, glbObsFitRsp, nObsFit = 100)
mdlDf = mdlDf.append(thsDf)
print '\nmdlDf: '
print(mdlDf)
print glbPickleFile['models']
# Start the search from scratch (no cached results DataFrame).
glbMdlDf = None
# glbMdlDf = pd.DataFrame()
%run img_utils.py
# Grid of training-set sizes to sweep.
srchParamsDct = {
    'nObsFit' : [100, 1000, 5000, 10000, 15000, glbObsFitFtr.shape[0]]
#   'nObsFit' = [100, 1000, 5000, 10000, glbObsFitFtr.shape[0]]
}
# Dry run: show which parameter combinations would be fitted.
jnk = mysearchParams(fitMdlLgtRgrSkl, srchParamsDct = srchParamsDct,
                     curResultsDf = glbMdlDf,
                     mode = 'displayonly',
                     save_filepathname = glbPickleFile['models'],
                     lclObsFitFtrRsh = glbObsFitFtrRsh,
                     lclObsFitRsp = glbObsFitRsp)
# thsDf, thsObsVldRspPredProba, thsObsNewRspPredProba = fitMdlLgtRgrSkl(
# glbObsFitFtrRsh, glbObsFitRsp, nObsFit = 100)
# for nObsFit in nObsFitSearch:
# if (glbMdlDf.shape[0] == 0) or \
# (glbMdlDf[(glbMdlDf.nObsFit == nObsFit )].shape[0] == 0):
# mdlDf = fitMdlLgtRgr(glbObsFitFtr, glbObsFitRsp, nObsFit = nObsFit, verbose = False)
# glbMdlDf = glbMdlDf.append(mdlDf)
#fitMdlLgtRgr(nObsFit, verbose = False)
%run img_utils.py
# Run the nObsFit sweep for real; results collected into glbMdlDf.
glbMdlDf = mysearchParams(fitMdlLgtRgrSkl, srchParamsDct = srchParamsDct,
                          curResultsDf = glbMdlDf,
                          mode = 'run',
                          sort_values = ['nObsFit', 'duration'],
                          sort_ascending = [False , True ],
                          save_filepathname = glbPickleFile['models'],
                          lclObsFitFtrRsh = glbObsFitFtrRsh,
                          lclObsFitRsp = glbObsFitRsp)
# Flag the chosen best fit by its (hard-coded) nObsFit index value.
glbMdlDf['bestFit'] = False
glbMdlDf.ix[18077.0, 'bestFit'] = True
# glbMdlDf.ix[(18077.0, 79726.0, 346.0), 'bestFit'] = True
# 18077.0
print glbMdlDf[list(set(glbMdlDf.columns) - set(srchParamsDct.keys()))]
# print glbMdlDf[tstGetCorrObsDf.yRowsN >= 70000]
# Convert the results frame to an R data.frame and plot model stats
# (accVld / logLossVld / duration vs nObsFit) via the R helper.
robjects.pandas2ri.activate()
pltRDf = robjects.conversion.py2ri(glbMdlDf)
# print(pltRDf)
pltRFn = robjects.r("""
source('~/Dropbox/datascience/R/myplot.R')
function(RDf, filename) {
mypltModelStats(RDf, c('accVld', 'logLossVld', 'duration'),
dim = c('nObsFit'),
scaleXFn = NULL,
#highLightIx = which.min(RDf$logLossVld),
highLightIx = which(RDf$bestFit == 'TRUE'),
title = NULL,
fileName = filename)
}
""")
pltRFn(pltRDf, 'img_02_fit_lgtRgrSkl_SFDD_glbMdlDf.png')
pltRFn = robjects.r("""
source('~/Dropbox/datascience/R/myplot.R')
function(RDf, filename) {
mypltModelStats(RDf, c('accVld', 'logLossVld'),
dim = c('nObsFit'),
scaleXFn = NULL,
#highLightIx = which.min(RDf$logLossVld),
highLightIx = which(RDf$bestFit == 'TRUE'),
title = NULL,
fileName = filename)
}
""")
pltRFn(pltRDf, 'img_02_fit_lgtRgrSkl_SFDD_glbMdlDf_logLossVld.png')
pltRFn = robjects.r("""
source('~/Dropbox/datascience/R/myplot.R')
function(RDf, filename) {
mypltModelStats(RDf, c('accVld'),
dim = c('nObsFit'),
scaleXFn = NULL,
#highLightIx = which.min(RDf$logLossVld),
highLightIx = which(RDf$bestFit == 'TRUE'),
title = NULL,
fileName = filename)
}
""")
pltRFn(pltRDf, 'img_02_fit_lgtRgrSkl_SFDD_glbMdlDf_accVld.png')
# 'sel' model: refit on the full Fit set.
selMdlDf, selObsVldRspPredProba, selObsNewRspPredProba = fitMdlLgtRgrSkl(
    glbObsFitFtrRsh, glbObsFitRsp,
    nObsFit = glbObsFitFtrRsh.shape[0],
    verbose = True)
# Validation confusion matrix for the sel model.
selObsVldRspPred = np.argmax(selObsVldRspPredProba, axis = 1)
yLbl = [glbRspClassDesc[glbRspClass[ix]] + ':' +
        glbRspClass[ix] + ':acl' for ix in xrange(glbRspClassN)]
xLbl = ['sel:' + glbRspClass[ix] + ':' + glbRspClassDesc[glbRspClass[ix]]
        for ix in xrange(glbRspClassN)]
# print labels
from sklearn import metrics as skl_metrics
plt.matshow(skl_metrics.confusion_matrix(glbObsVldRsp,
                                         selObsVldRspPred), cmap='Reds',
            interpolation='none')
plt.yticks(np.arange(10), yLbl)
plt.xticks(np.arange(10), xLbl, rotation=90);
# To ensure Kaggle evaluation metric is same as sklearn.metrics.log_loss
def mygetMetricLogLoss(lclRspPredProba, lclRsp, verbose = False):
    """Compute multi-class log loss the way Kaggle / sklearn.metrics.log_loss do.

    Parameters:
        lclRspPredProba -- (nObs, nClasses) array of predicted class probabilities
        lclRsp          -- (nObs,) array of integer class labels in [0, nClasses)
        verbose         -- print the single worst (most negative) per-obs term

    Returns the mean negative log-likelihood (a non-negative float).
    Rows whose probabilities do not sum to 1 are rescaled, and probabilities
    are clipped away from 0 and 1 to bound the log terms, matching the
    Kaggle evaluation convention. The input array is never modified.
    """
    # One-hot indicator matrix built from the integer labels.
    lclRspIndicator = np.zeros_like(lclRspPredProba)
    for cls in range(lclRspIndicator.shape[1]):
        lclRspIndicator[lclRsp == cls, cls] = 1
    # Scale proba to sum to 1 for each row
    tmpRspPredProba = lclRspPredProba
    sclRspPredProbaRowSum = tmpRspPredProba.sum(axis = 1)
    sclRspPredProbaRowSumChk = (np.abs(sclRspPredProbaRowSum - 1.0) > 1e-15)
    if (sclRspPredProbaRowSumChk.sum() > 0):
        print('row sums != 1 for %d obs' % (sclRspPredProbaRowSumChk.sum()))
        print(sclRspPredProbaRowSum[sclRspPredProbaRowSumChk])
    # BUG FIX: the row sums are (nObs,) while the proba matrix is
    # (nObs, nClasses); without [:, np.newaxis] the division broadcasts
    # against the class axis (and raises unless nObs == nClasses).
    sclRspPredProba = tmpRspPredProba / sclRspPredProbaRowSum[:, np.newaxis]
    tmpRspPredProba = sclRspPredProba
    # Bound proba to limit log fn outliers.
    # BUG FIX: copy before clipping — the original aliased the same array,
    # so the != comparison below could never detect any modified cells.
    bndRspPredProba = tmpRspPredProba.copy()
    bndRspPredProba[bndRspPredProba > 1-1e-15] = 1-1e-15
    bndRspPredProba[bndRspPredProba < 0+1e-15] = 0+1e-15
    nModProba = (tmpRspPredProba != bndRspPredProba).sum()
    if (nModProba > 0):
        print('minmax of probabilities modified %d cells' % (nModProba))
    tmpRspPredProba = bndRspPredProba
    # Per-observation log-likelihood: log prob of the true class only.
    logLossObs = (lclRspIndicator * np.log(tmpRspPredProba)).sum(axis = 1)
    if verbose:
        print('mygetMetricLogLoss: logLossObs outlier: %.4f; ix: %d' % \
            (np.min(logLossObs), np.argmin(logLossObs)))
    logLoss = 0 - (logLossObs.sum() / tmpRspPredProba.shape[0])
    return(logLoss)
# Sanity-check: the hand-rolled metric must agree with sklearn's logLossVld.
selLogLossVld = mygetMetricLogLoss(selObsVldRspPredProba, glbObsVldRsp,
                                   verbose = True)
assert abs(selLogLossVld - selMdlDf.ix[0, 'logLossVld']) < 0.0001, \
    'not same: %.4f != %.4f' % \
    (selLogLossVld, selMdlDf.ix[0, 'logLossVld'])
# from sklearn import metrics as skl_metrics
# print skl_metrics.log_loss(glbObsVldRsp, selObsVldRspPredProba)
# Inspect a known outlier validation observation (index 1903).
print selObsVldRspPredProba[1903:1904]
print np.argmax(selObsVldRspPredProba[1903:1904], axis = 1)
print (np.argmax(selObsVldRspPredProba[1903:1904], axis = 1) == 0)
print (np.argmax(selObsVldRspPredProba[1903:1904], axis = 1) == 0).any()
%run img_utils.py
mydisplayImagePredictions(selMdlDf.ix[0, 'model'],
    glbObsVldIdn[1903:1904], glbObsVldFtr[1903:1904],
    glbObsVldRsp[1903:1904], selObsVldRspPredProba[1903:1904],
    glbRspClass, glbRspClassDesc)
# mdlFinDf = fitMdlLgtRgr(glbObsTrnFtr, glbObsTrnRsp, nObsFit = glbObsTrnFtr.shape[0],
#                         verbose = True)
# 'fin' model: refit on all labeled data (Fit + Vld combined).
finMdlDf, finObsVldRspPredProba, finObsNewRspPredProba = fitMdlLgtRgrSkl(
    glbObsTrnFtrRsh, glbObsTrnRsp,
    nObsFit = glbObsTrnFtrRsh.shape[0],
    verbose = True)
glbMdlDf = glbMdlDf.append(finMdlDf)
#mdlFinDf = mdlDf
# Re-index by the searched parameter(s) and persist the results frame.
glbMdlDf = glbMdlDf.set_index(srchParamsDct.keys(), drop = False)
print(glbMdlDf[list(set(glbMdlDf.columns) - set(srchParamsDct.keys()))])
%run img_utils.py
myexportDf(glbMdlDf, glbPickleFile['models'], save_drop_cols = None)
# glbMdlDf = mydspMdls(glbMdlDf)
# mysaveMdls()
The final ('fin') model shows the same outlier validation observation (index 1903) as selMdlDf['model'].
# Re-inspect the outlier observation under the fin model.
mydisplayImagePredictions(finMdlDf.ix[0, 'model'],
    glbObsVldIdn[1903:1904], glbObsVldFtr[1903:1904],
    glbObsVldRsp[1903:1904], finObsVldRspPredProba[1903:1904],
    glbRspClass, glbRspClassDesc)
# Same metric cross-check for the fin model.
finLogLossVld = mygetMetricLogLoss(finObsVldRspPredProba, glbObsVldRsp,
                                   verbose = True)
assert abs(finLogLossVld - finMdlDf.ix[0, 'logLossVld']) < 0.0001, \
    'not same: %.4f != %.4f' % \
    (finLogLossVld, finMdlDf.ix[0, 'logLossVld'])
# Spot-check another validation observation.
tmpObsVldIx = 790
mydisplayImagePredictions(finMdlDf.ix[0, 'model'],
    glbObsVldIdn[tmpObsVldIx:(tmpObsVldIx + 1)],
    glbObsVldFtr[tmpObsVldIx:(tmpObsVldIx + 1)],
    glbObsVldRsp[tmpObsVldIx:(tmpObsVldIx + 1)],
    finObsVldRspPredProba[tmpObsVldIx:(tmpObsVldIx + 1)],
    glbRspClass, glbRspClassDesc)
# Validation confusion matrix for the fin model.
finObsVldRspPred = np.argmax(finObsVldRspPredProba, axis = 1)
yLbl = [glbRspClassDesc[glbRspClass[ix]] + ':' +
        glbRspClass[ix] + ':acl' for ix in xrange(glbRspClassN)]
xLbl = ['fin:' + glbRspClass[ix] + ':' + glbRspClassDesc[glbRspClass[ix]]
        for ix in xrange(glbRspClassN)]
# print labels
from sklearn import metrics as skl_metrics
plt.matshow(skl_metrics.confusion_matrix(glbObsVldRsp,
                                         finObsVldRspPred), cmap='Reds',
            interpolation='none')
plt.yticks(np.arange(10), yLbl)
plt.xticks(np.arange(10), xLbl, rotation=90);
# Compare predicted class distributions on the New set against the
# training-label distribution (ratios near a constant suggest balance).
selObsNewRspPred = np.argmax(selObsNewRspPredProba, axis = 1)
print np.unique(selObsNewRspPred, return_counts = True)
print np.unique(selObsNewRspPred, return_counts = True)[1] * 1.0 / \
    np.unique(glbObsTrnRsp, return_counts = True)[1]
# tmpObsTrnRsp = np.argmax(glbObsTrnRsp, axis = 1)
# print np.unique(selObsNewRspPred, return_counts = True)[1] * 1.0 / \
#     np.unique(tmpObsTrnRsp, return_counts = True)[1]
finObsNewRspPred = np.argmax(finObsNewRspPredProba, axis = 1)
print np.unique(finObsNewRspPred, return_counts = True)
print np.unique(finObsNewRspPred, return_counts = True)[1] * 1.0 / \
    np.unique(glbObsTrnRsp, return_counts = True)[1]
# tmpObsTrnRsp = np.argmax(glbObsTrnRsp, axis = 1)
# print np.unique(finObsNewRspPred, return_counts = True)[1] * 1.0 / \
#     np.unique(tmpObsTrnRsp, return_counts = True)[1]
# Visual comparison of sel vs fin predictions on the New set.
mydisplayImagePredictions(selMdlDf.ix[0, 'model'],
    glbObsNewIdn,
    glbObsNewFtr,
    selObsNewRspPred,
    selObsNewRspPredProba,
    glbRspClass, glbRspClassDesc)
%run img_utils.py
mydisplayImagePredictions(finMdlDf.ix[0, 'model'],
    glbObsNewIdn,
    glbObsNewFtr,
    finObsNewRspPred,
    finObsNewRspPredProba,
    glbRspClass, glbRspClassDesc)
# Confusion matrix of sel predictions vs fin predictions (model agreement).
selLbl = [glbRspClassDesc[glbRspClass[ix]] + ':' +
          glbRspClass[ix] + ':sel' for ix in xrange(glbRspClassN)]
finLbl = ['fin:' + glbRspClass[ix] + ':' + glbRspClassDesc[glbRspClass[ix]]
          for ix in xrange(glbRspClassN)]
# print labels
from sklearn import metrics as skl_metrics
plt.matshow(skl_metrics.confusion_matrix(selObsNewRspPred,
                                         finObsNewRspPred), cmap='Reds',
            interpolation='none')
plt.yticks(np.arange(10), selLbl)
plt.xticks(np.arange(10), finLbl, rotation=90);
# Rank New observations by total absolute probability disagreement between
# the two models, and display the ten most-disputed images.
finSelProbaDffAbs = abs(finObsNewRspPredProba - selObsNewRspPredProba)
finSelProbaDffAbsObsSum = np.sum(finSelProbaDffAbs, axis = 1)
finSelDf = pd.DataFrame({
    'idx': range(glbObsNewRsp.shape[0]),
    'idn': glbObsNewIdn,
    'proba.dff.abs.cls.sum' : finSelProbaDffAbsObsSum
})
finSelDf = finSelDf.sort_values('proba.dff.abs.cls.sum', ascending = False)
finSelDf.hist('proba.dff.abs.cls.sum')
plt.show()
print finSelDf[:10]
for ix in xrange(10):
    imgFilePth = os.getcwd() + '/data/' + glbDataFile['newFoldersPth'] + '/' + \
        glbObsNewIdn[finSelDf.iloc[ix]['idx']]
#     print '\n %s:' % imgFilePth
    print '\n %s:' % (glbObsNewIdn[finSelDf.iloc[ix]['idx']])
    jpgfile = Image(imgFilePth, format = 'jpg',
                    width = glbImg['size'] * 4, height = glbImg['size'] * 4)
    display(jpgfile)
    selCls = np.argmax(selObsNewRspPredProba[finSelDf.iloc[ix]['idx'], :])
    print 'Sel Proba for cls: %s; desc: %30s; proba: %0.4f' % \
        (selCls, glbRspClassDesc['c' + str(selCls)],
         np.max(selObsNewRspPredProba[finSelDf.iloc[ix]['idx'], :]))
    finCls = np.argmax(finObsNewRspPredProba[finSelDf.iloc[ix]['idx'], :])
    print 'Fin Proba for cls: %s; desc: %30s; proba: %0.4f' % \
        (finCls, glbRspClassDesc['c' + str(finCls)],
         np.max(finObsNewRspPredProba[finSelDf.iloc[ix]['idx'], :]))
# %run img_glbSpec_SFDD_ImgSz_64.py
# Write Kaggle submission CSVs for both models and record LB scores.
print '\n selMdl:'
glbwriteSubmission(glbObsNewIdn, selObsNewRspPredProba,
    'img_02_fit_lgtRgrSkl_SFDD_ImgSz_' + str(glbImg['size']) + '_sbmt_sel.csv')
print '\n finMdl:'
glbwriteSubmission(glbObsNewIdn, finObsNewRspPredProba,
    'img_02_fit_lgtRgrSkl_SFDD_ImgSz_' + str(glbImg['size']) + '_sbmt_fin.csv')
print 'LeaderBoard metric for this sel submission: %0.5f' % (3.73153)
print 'LeaderBoard metric for this fin submission: %0.5f' % (3.37683)
print 'Best score yet:%s: %0.5f' % \
    ('img_02_fit_lgtRgr(Skl)_SFDD_(ImgSz_32_)sbmt(_fin).csv', 2.63892)